plot(cars)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
This script read in repeatmasker output file, most updated version of all species, zrad and mcic already removed low complexity and simple repeat. And have length column, and remove overlapping TEs. 1. Recent element analysis 2. Biggest families analysis
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(dplyr)
library(ggsci)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
mcic <- read_tsv("~/bigdata/TE_composition-EDTA/tables/McicTEtableV3.tsv")
## Rows: 2393218 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): query, leftq, strand, family, class, beginr, leftr, overlap
## dbl (9): score, div., del., ins., beginq, endq, endr, ID, length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
zrad <- read_tsv("~/bigdata/TE_composition-EDTA/tables/ZradTEtableV3.tsv")
## Rows: 748260 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): query, leftq, strand, family, class, beginr, leftr, overlap
## dbl (9): score, div., del., ins., beginq, endq, endr, ID, length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
emus <- read_table("~/bigdata/EDTA/RepeatLandscape2-EDTA/Entomophthora_muscae_UCB.Nanopore10X_v2.rmblast/Entomophthora_muscae_UCB.Nanopore10X_v2.fasta.out",skip=3,col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_double(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_double(),
## X5 = col_character(),
## X6 = col_double(),
## X7 = col_double(),
## X8 = col_character(),
## X9 = col_character(),
## X10 = col_character(),
## X11 = col_character(),
## X12 = col_character(),
## X13 = col_double(),
## X14 = col_character(),
## X15 = col_double(),
## X16 = col_character()
## )
colnames(emus) <-c("score","div.","del.","ins.","query","beginq","endq","leftq","strand","family","class","beginr","endr","leftr","ID","overlap")
emus <- emus %>% mutate(length=endq-beginq +1)
emus1 <- subset(emus,class != "Low_complexity")
emus2 <- subset(emus1,class != "Simple_repeat")
emai <- read_table("~/bigdata/EDTA/RepeatLandscape2-EDTA/Entomophaga_maimaiga_var_ARSEF_7190.rmblast1/Entomophaga_maimaiga_var_ARSEF_7190.fasta.out",skip=3,col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_double(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_double(),
## X5 = col_character(),
## X6 = col_double(),
## X7 = col_double(),
## X8 = col_character(),
## X9 = col_character(),
## X10 = col_character(),
## X11 = col_character(),
## X12 = col_character(),
## X13 = col_double(),
## X14 = col_character(),
## X15 = col_double(),
## X16 = col_character()
## )
colnames(emai) <-c("score","div.","del.","ins.","query","beginq","endq","leftq","strand","family","class","beginr","endr","leftr","ID","overlap")
emai <- emai %>% mutate(length=endq-beginq +1)
emai1 <- subset(emai,class != "Low_complexity")
emai2 <- subset(emai1,class != "Simple_repeat")
remove overlapping counts for all species
mcic$overlap[is.na(mcic$overlap)] <- 1
mcicNO<-mcic %>% filter(overlap != "*")
zrad$overlap[is.na(zrad$overlap)] <- 1
zradNO<-zrad %>% filter(overlap != "*")
emus2$overlap[is.na(emus2$overlap)] <- 1
emusNO<-emus2 %>% filter(overlap != "*")
emai2$overlap[is.na(emai2$overlap)] <- 1
emaiNO<-emai2 %>% filter(overlap != "*")
Now look for recent elements in these species, divergence < 5% First Zrad
recentzrad <- zradNO %>% filter(div. < 5)
recentzrad %>% group_by(class) %>% summarise(Zrad=n_distinct(ID)) %>%arrange(class)
And Emus
recentemus <- emusNO %>% filter(div. < 5)
recentemus %>% group_by(class) %>% summarise(Emus=n_distinct(ID)) %>%arrange(class)
And Emai
recentemai <- emaiNO %>% filter(div. < 5)
recentemai %>% group_by(class) %>% summarise(Emai=n_distinct(ID)) %>%arrange(class)
recentmcic <- mcic %>% filter(div. < 5)
recentmcic %>% group_by(class) %>% summarise(Mcic=n_distinct(ID)) %>%arrange(class)
each table add a species column and join those tables together!
recentclasszrad <- recentzrad %>% group_by(class) %>% summarise(Zrad=n_distinct(ID)) %>%arrange(class)
recentclassmcic <- recentmcic %>% group_by(class) %>% summarise(Mcic=n_distinct(ID)) %>%arrange(class)
recentclassemai <- recentemai %>% group_by(class) %>% summarise(Emai=n_distinct(ID)) %>%arrange(class)
recentclassemus <- recentemus %>% group_by(class) %>% summarise(Emus=n_distinct(ID)) %>%arrange(class)
recent1 <- full_join(recentclassemus,recentclassemai)
## Joining with `by = join_by(class)`
recent2<- full_join(recent1,recentclassmcic)
## Joining with `by = join_by(class)`
full_join(recent2,recentclasszrad)
## Joining with `by = join_by(class)`
For visual maybe show 5 DNA classes, omit any have less than 100 in every species in the bar plot. add a table seperating TIR and MITE.
recent3 <- full_join(recent2,recentclasszrad)
## Joining with `by = join_by(class)`
How about recent families
recentzrad %>% group_by(family,class) %>% summarise(n=n_distinct(ID)) %>%arrange(desc(n))
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
recentmcic %>% group_by(family,class) %>% summarise(n=n_distinct(ID)) %>%arrange(desc(n))
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
recentemai %>% group_by(family,class) %>% summarise(n=n_distinct(ID)) %>%arrange(desc(n))
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
recentemus %>% group_by(family,class) %>% summarise(n=n_distinct(ID)) %>%arrange(desc(n))
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
For massospora, biggest families by length, but not overlapping
head(mcic)
mcicNO$family<-sub("_INT","",mcicNO$family)
mcicNO$family<-sub("_LTR","",mcicNO$family)
mcicNO <- mcicNO %>% mutate(familyname = paste("Mcic",mcicNO$family,mcicNO$class))
mcicNO %>% group_by(familyname,class) %>%
summarise(coverage=sum(length),n=n_distinct(ID)) %>%
mutate(percentage=(coverage/1488883982)*100) %>%
arrange(desc(coverage))
## `summarise()` has grouped output by 'familyname'. You can override using the
## `.groups` argument.
What is the biggest families by number
mcicNO %>% group_by(familyname,class) %>%
summarise(coverage=sum(length),n=n_distinct(ID)) %>%
mutate(percentage=(coverage/1488883982)*100) %>%
arrange(desc(n))
## `summarise()` has grouped output by 'familyname'. You can override using the
## `.groups` argument.
mcicbig <- mcicNO %>% group_by(familyname,class) %>%
summarise(coverage=sum(length),n=n_distinct(ID)) %>%
mutate(percentage=(coverage/1488883982)*100) %>%
arrange(desc(coverage))
## `summarise()` has grouped output by 'familyname'. You can override using the
## `.groups` argument.
mcictop25 <- head(mcicbig,25)
mcictop25 %>%
ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
geom_bar(stat="identity", fill="#00A087FF") +
theme_bw()+
xlab("TE family")+
ylab("genome coverage %")+
coord_flip() +
ggtitle("M.cicadina top 25 families")
Jan 3 2024
Use statistics from build summary function from repeatmasker
mcicfam<-read_table("~/bigdata/TE_composition-EDTA/tables/Mcicfamilysummary.txt",col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_character(),
## X5 = col_logical()
## )
colnames(mcicfam)<-c("family","count","bp","percentage","unknown")
head(mcicfam)
mcicfam1<-mcic %>% group_by(family,class) %>% summarise(n=n())
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
mcicfam2<-left_join(mcicfam,mcicfam1,join_by(family))
head(mcicfam2)
remove INT and LTR from family name, and add a column with species and classification name to it
mcicfam2$family<-sub("_INT","",mcicfam2$family)
mcicfam2$family<-sub("_LTR","",mcicfam2$family)
mcicfam2 <- mcicfam2 %>% mutate(familyname = paste("Mcic",mcicfam2$family,mcicfam2$class))
mcicfam2 %>% arrange(desc(percentage))
mcicfam2 %>% arrange(desc(count))
mcicbig1<-mcicfam2 %>% arrange(desc(percentage))
mcictop <- head(mcicbig1,25)
mcictop$familyname<-sub("DTM","Mutator",mcictop$familyname)
mcictop$familyname<-sub("DTC","CMC",mcictop$familyname)
mcictop$percentage <- sub("%","",mcictop$percentage)
mcictop$percentage<-as.numeric(mcictop$percentage)
mcictop %>%
ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
geom_bar(stat="identity", fill="#00A087FF") +
theme_bw()+
xlab("TE family")+
ylab("genome coverage %")+
coord_flip() +
ggtitle("M.cicadina top 25 families")
Biggest families by number
mcictop1<-mcicfam2 %>% arrange(desc(count))
mcictopnum<-head(mcictop1,25)
mcictopnum$familyname<-sub("DTM","Mutator",mcictopnum$familyname)
mcictopnum$familyname<-sub("DTC","CMC",mcictopnum$familyname)
mcictopnum %>%
ggplot(aes(x=reorder(familyname,count), y=count)) +
geom_bar(stat="identity", fill="#00A087FF") +
theme_bw()+
xlab("TE family")+
ylab("count number")+
coord_flip() +
ggtitle("M.cicadina top 25 families by abundance")
For Emus
emusfam<-read_table("~/bigdata/TE_composition-EDTA/tables/Emusfamilysummary.txt",col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_character(),
## X5 = col_logical()
## )
colnames(emusfam)<-c("family","count","bp","percentage","unknown")
head(emusfam)
emusfam1<-emus2 %>% group_by(family,class) %>% summarise(n=n())
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
emusfam2<-left_join(emusfam,emusfam1,join_by(family))
head(emusfam2)
remove INT and LTR from family name, and add a column with species and classification name to it
emusfam2$family<-sub("_INT","",emusfam2$family)
emusfam2$family<-sub("_LTR","",emusfam2$family)
emusfam2 <- emusfam2 %>% mutate(familyname = paste("Emus",emusfam2$family,emusfam2$class))
emusfam2 %>% arrange(desc(percentage))
emusfam2 %>% arrange(desc(count))
emusbig1<-emusfam2 %>% arrange(desc(percentage))
emustop <- head(emusbig1,25)
emustop$familyname<-sub("DTM","Mutator",emustop$familyname)
emustop$familyname<-sub("DTC","CMC",emustop$familyname)
emustop$percentage <- sub("%","",emustop$percentage)
emustop$percentage<-as.numeric(emustop$percentage)
emustop %>%
ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
geom_bar(stat="identity", fill="#4DBBD5FF") +
theme_bw()+
xlab("TE family")+
ylab("genome coverage %")+
coord_flip() +
ggtitle("E.muscae top 25 families")
emustop1<-emusfam2 %>% arrange(desc(count))
emustopnum<-head(emustop1,25)
emustopnum$familyname<-sub("DTM","Mutator",emustopnum$familyname)
emustopnum$familyname<-sub("DTC","CMC",emustopnum$familyname)
emustopnum$familyname<-sub("DTT","TcMar",emustopnum$familyname)
emustopnum$familyname<-sub("DTA","hAT",emustopnum$familyname)
emustopnum %>%
ggplot(aes(x=reorder(familyname,count), y=count)) +
geom_bar(stat="identity", fill="#4DBBD5FF") +
theme_bw()+
xlab("TE family")+
ylab("count number")+
coord_flip() +
ggtitle("E.muscae top 25 families by abundance")
For Emai
emaifam<-read_table("~/bigdata/TE_composition-EDTA/tables/Emaifamilysummary.txt",col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_character(),
## X5 = col_logical()
## )
colnames(emaifam)<-c("family","count","bp","percentage","unknown")
head(emaifam)
emaifam1<-emai2 %>% group_by(family,class) %>% summarise(n=n())
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
emaifam2<-left_join(emaifam,emaifam1,join_by(family))
head(emaifam2)
remove INT and LTR from family name, and add a column with species and classification name to it
emaifam2$family<-sub("_INT","",emaifam2$family)
emaifam2$family<-sub("_LTR","",emaifam2$family)
emaifam2 <- emaifam2 %>% mutate(familyname = paste("Emai",emaifam2$family,emaifam2$class))
emaifam2 %>% arrange(desc(percentage))
emaifam2 %>% arrange(desc(count))
emaibig1<-emaifam2 %>% arrange(desc(percentage))
emaitop <- head(emaibig1,25)
emaitop$familyname<-sub("DTM","Mutator",emaitop$familyname)
emaitop$familyname<-sub("DTC","CMC",emaitop$familyname)
emaitop$familyname<-sub("DTT","TcMar",emaitop$familyname)
emaitop$familyname<-sub("DTA","hAT",emaitop$familyname)
emaitop$percentage <- sub("%","",emaitop$percentage)
emaitop$percentage<-as.numeric(emaitop$percentage)
emaitop %>%
ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
geom_bar(stat="identity", fill="#E64B35FF") +
theme_bw()+
xlab("TE family")+
ylab("genome coverage %")+
coord_flip() +
ggtitle("E.maimaiga top 25 families")
emaitop1<-emaifam2 %>% arrange(desc(count))
emaitopnum<-head(emaitop1,25)
emaitopnum$familyname<-sub("DTM","Mutator",emaitopnum$familyname)
emaitopnum$familyname<-sub("DTC","CMC",emaitopnum$familyname)
emaitopnum$familyname<-sub("DTT","TcMar",emaitopnum$familyname)
emaitopnum$familyname<-sub("DTA","hAT",emaitopnum$familyname)
emaitopnum %>%
ggplot(aes(x=reorder(familyname,count), y=count)) +
geom_bar(stat="identity", fill="#E64B35FF") +
theme_bw()+
xlab("TE family")+
ylab("count number")+
coord_flip() +
ggtitle("E.maimaiga top 25 families by abundance")
for Zrad
zradfam<-read_table("~/bigdata/TE_composition-EDTA/tables/Zradfamilysummary.txt",col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_character(),
## X5 = col_logical()
## )
colnames(zradfam)<-c("family","count","bp","percentage","unknown")
head(zradfam)
zradfam1<-zrad %>% group_by(family,class) %>% summarise(n=n())
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
zradfam2<-left_join(zradfam,zradfam1,join_by(family))
head(zradfam2)
remove INT and LTR from family name, and add a column with species and classification name to it
zradfam2$family<-sub("_INT","",zradfam2$family)
zradfam2$family<-sub("_LTR","",zradfam2$family)
zradfam2 <- zradfam2 %>% mutate(familyname = paste("Zrad",zradfam2$family,zradfam2$class))
zradfam2 %>% arrange(desc(percentage))
zradfam2 %>% arrange(desc(count))
zradbig1<-zradfam2 %>% arrange(desc(percentage))
zradtop <- head(zradbig1,25)
zradtop$familyname<-sub("DTM","Mutator",zradtop$familyname)
zradtop$familyname<-sub("DTC","CMC",zradtop$familyname)
zradtop$familyname<-sub("DTT","TcMar",zradtop$familyname)
zradtop$familyname<-sub("DTA","hAT",zradtop$familyname)
zradtop$percentage <- sub("%","",zradtop$percentage)
zradtop$percentage<-as.numeric(zradtop$percentage)
zradtop %>%
ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
geom_bar(stat="identity", fill="#3C5488FF") +
theme_bw()+
xlab("TE family")+
ylab("genome coverage %")+
coord_flip() +
ggtitle("Z.radicans top 25 families")
zradtop1<-zradfam2 %>% arrange(desc(count))
zradtopnum<-head(zradtop1,25)
zradtopnum$familyname<-sub("DTM","Mutator",zradtopnum$familyname)
zradtopnum$familyname<-sub("DTC","CMC",zradtopnum$familyname)
zradtopnum$familyname<-sub("DTT","TcMar",zradtopnum$familyname)
zradtopnum$familyname<-sub("DTA","hAT",zradtopnum$familyname)
zradtopnum %>%
ggplot(aes(x=reorder(familyname,count), y=count)) +
geom_bar(stat="identity", fill="#3C5488FF") +
theme_bw()+
xlab("TE family")+
ylab("count number")+
coord_flip() +
ggtitle("Z.radicans top 25 families by abundance")
In non-overlapping count make idenctity column group_by ID,familyname,
summarise average identity subset the table only look at the interesting
family For M.cic let’s do top 10 families first
library(viridis)
## Loading required package: viridisLite
mcicNO<-mcicNO %>% mutate(Identity=100-div.)
mcicNO1<-mcicNO %>% group_by(ID,familyname) %>% summarise(Identity=mean(Identity))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
mcictop10<-head(mcictop1,10)
mcictop10famname<-c(mcictop10$familyname)
mcictop10Iden<-subset(mcicNO1,familyname == mcictop10famname)
## Warning in familyname == mcictop10famname: longer object length is not a
## multiple of shorter object length
mcictop10Iden$familyname<-sub("DTM","Mutator",mcictop10Iden$familyname)
mcictop10Iden$familyname<-sub("DTC","CMC",mcictop10Iden$familyname)
mcictop10Iden %>% ggplot(aes(x=Identity,y=familyname)) +
geom_violin(width=1.7)+
geom_boxplot(width=0.2, color="grey", alpha=0.2,outlier.size = 0) +
scale_fill_viridis(discrete = TRUE)+
theme_ipsum()
## Warning: `position_dodge()` requires non-overlapping x intervals
DNA/DTC and DNA/DTM expanded during short burst, LTRs and Helitrons
expanded gradually during long period of time. For Z.rad
zradNO<-zradNO %>% mutate(Identity=100-div.)
zradNO$family<-sub("_INT","",zradNO$family)
zradNO$family<-sub("_LTR","",zradNO$family)
zradNO <- zradNO %>% mutate(familyname = paste("Zrad",zradNO$family,zradNO$class))
zradNO1<-zradNO %>% group_by(ID,familyname) %>% summarise(Identity=mean(Identity))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
zradtop10<-head(zradtop1,10)
zradtop10famname<-c(zradtop10$familyname)
zradtop10Iden<-subset(zradNO1,familyname == zradtop10famname)
## Warning in familyname == zradtop10famname: longer object length is not a
## multiple of shorter object length
zradtop10Iden$familyname<-sub("DTM","Mutator",zradtop10Iden$familyname)
zradtop10Iden$familyname<-sub("DTC","CMC",zradtop10Iden$familyname)
zradtop10Iden$familyname<-sub("DTT","TcMar",zradtop10Iden$familyname)
zradtop10Iden$familyname<-sub("DTA","hAT",zradtop10Iden$familyname)
zradtop10Iden %>% ggplot(aes(x=Identity,y=familyname)) +
geom_violin(width=1.7)+
geom_boxplot(width=0.2, color="grey", alpha=0.2,outlier.size = 0) +
scale_fill_viridis(discrete = TRUE)+
theme_ipsum()
## Warning: `position_dodge()` requires non-overlapping x intervals
For Emus
emusNO<-emusNO %>% mutate(Identity=100-div.)
emusNO$family<-sub("_INT","",emusNO$family)
emusNO$family<-sub("_LTR","",emusNO$family)
emusNO <- emusNO %>% mutate(familyname = paste("Emus",emusNO$family,emusNO$class))
emusNO1<-emusNO %>% group_by(ID,familyname) %>% summarise(Identity=mean(Identity))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
emustop10<-head(emustop1,10)
emustop10famname<-c(emustop10$familyname,"Emus TE_00001791 MITE/DTM")
emustop10Iden<-subset(emusNO1,familyname == emustop10famname)
emustop10Iden$familyname<-sub("DTM","Mutator",emustop10Iden$familyname)
emustop10Iden$familyname<-sub("DTC","CMC",emustop10Iden$familyname)
#zradtop10Iden$familyname<-sub("DTT","TcMar",zradtop10Iden$familyname)
#zradtop10Iden$familyname<-sub("DTA","hAT",zradtop10Iden$familyname)
emustop10Iden %>% ggplot(aes(x=Identity,y=familyname)) +
geom_violin(width=1.7)+
geom_boxplot(width=0.2, color="grey", alpha=0.2,outlier.size = 0) +
scale_fill_viridis(discrete = TRUE)+
theme_ipsum()
## Warning: `position_dodge()` requires non-overlapping x intervals
For Emai
emaiNO<-emaiNO %>% mutate(Identity=100-div.)
emaiNO$family<-sub("_INT","",emaiNO$family)
emaiNO$family<-sub("_LTR","",emaiNO$family)
emaiNO <- emaiNO %>% mutate(familyname = paste("Emai",emaiNO$family,emaiNO$class))
emaiNO1<-emaiNO %>% group_by(ID,familyname) %>% summarise(Identity=mean(Identity))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
emaitop10<-head(emaitop1,10)
emaitop10famname<-c(emaitop10$familyname,"Emai TE_00001229 MITE/DTM")
emaitop10Iden<-subset(emaiNO1,familyname == emaitop10famname)
## Warning in familyname == emaitop10famname: longer object length is not a
## multiple of shorter object length
emaitop10Iden$familyname<-sub("DTM","Mutator",emaitop10Iden$familyname)
emaitop10Iden$familyname<-sub("DTC","CMC",emaitop10Iden$familyname)
#zradtop10Iden$familyname<-sub("DTT","TcMar",zradtop10Iden$familyname)
#zradtop10Iden$familyname<-sub("DTA","hAT",zradtop10Iden$familyname)
emaitop10Iden %>% ggplot(aes(x=Identity,y=familyname)) +
geom_violin(width=1.7)+
geom_boxplot(width=0.2, color="grey", alpha=0.2,outlier.size = 0) +
scale_fill_viridis(discrete = TRUE)+
theme_ipsum()
## Warning: `position_dodge()` requires non-overlapping x intervals